library(tidyverse)
library(tidyr)
library(fst)
library(plotly)
library(lubridate)
library(ggcorrplot)
library(zoo)
library(ggstance)
library(gganimate)
library(tidyverse)
library(skimr) 
library(naniar)
library(maps) 
library(ggmap)
library(gplots) 
library(RColorBrewer) 
library(sf) 
library(leaflet) 
library(carData)
library(fst)
library(plotly)
rm(list=setdiff(ls(), c()))

data18 <- read_fst("data18-fixed.fst") %>% 
  mutate(OP_UNIQUE_CARRIER = replace(OP_UNIQUE_CARRIER, OP_UNIQUE_CARRIER == "9E", "DL")) %>% 
  mutate(OP_UNIQUE_CARRIER = replace(OP_UNIQUE_CARRIER, OP_UNIQUE_CARRIER == "VX", "AS")) %>% 
  mutate(OP_UNIQUE_CARRIER = replace(OP_UNIQUE_CARRIER, OP_UNIQUE_CARRIER == "MQ", "AA")) %>% 
  mutate(OP_UNIQUE_CARRIER = replace(OP_UNIQUE_CARRIER, OP_UNIQUE_CARRIER == "OH", "AA")) %>% 
  mutate(OP_UNIQUE_CARRIER = replace(OP_UNIQUE_CARRIER, OP_UNIQUE_CARRIER == "AA", "AA - American Airlines")) %>% 
  mutate(OP_UNIQUE_CARRIER = replace(OP_UNIQUE_CARRIER, OP_UNIQUE_CARRIER == "AS", "AA - Alaska Airlines")) %>%
  mutate(OP_UNIQUE_CARRIER = replace(OP_UNIQUE_CARRIER, OP_UNIQUE_CARRIER == "B6", "B6 - JetBlue Airways")) %>%
  mutate(OP_UNIQUE_CARRIER = replace(OP_UNIQUE_CARRIER, OP_UNIQUE_CARRIER == "DL", "DL - Delta Air Lines")) %>%
  mutate(OP_UNIQUE_CARRIER = replace(OP_UNIQUE_CARRIER, OP_UNIQUE_CARRIER == "EV", "EV - ExpressJet Airlines")) %>%
  mutate(OP_UNIQUE_CARRIER = replace(OP_UNIQUE_CARRIER, OP_UNIQUE_CARRIER == "F9", "F9 - Frontier Airlines")) %>%
  mutate(OP_UNIQUE_CARRIER = replace(OP_UNIQUE_CARRIER, OP_UNIQUE_CARRIER == "G4", "G4 - Allegiant Air")) %>%
  mutate(OP_UNIQUE_CARRIER = replace(OP_UNIQUE_CARRIER, OP_UNIQUE_CARRIER == "HA", "HA - Hawaii Airlines")) %>%
  mutate(OP_UNIQUE_CARRIER = replace(OP_UNIQUE_CARRIER, OP_UNIQUE_CARRIER == "NK", "NK - Spirit Airlines")) %>%
  mutate(OP_UNIQUE_CARRIER = replace(OP_UNIQUE_CARRIER, OP_UNIQUE_CARRIER == "OO", "OO - SkyWest Airlines")) %>%
  mutate(OP_UNIQUE_CARRIER = replace(OP_UNIQUE_CARRIER, OP_UNIQUE_CARRIER == "UA", "UA - United Airlines")) %>%
  mutate(OP_UNIQUE_CARRIER = replace(OP_UNIQUE_CARRIER, OP_UNIQUE_CARRIER == "WN", "WN - Southwest Airlines")) %>%
  mutate(OP_UNIQUE_CARRIER = replace(OP_UNIQUE_CARRIER, OP_UNIQUE_CARRIER == "YV", "YV - Mesa Airlines")) %>%
  mutate(OP_UNIQUE_CARRIER = replace(OP_UNIQUE_CARRIER, OP_UNIQUE_CARRIER == "YX", "YX - Republic Airways"))

airports <- read_fst("airport_data.fst")

cols <- c("AA - American Airlines"="#36ace2",
          "AA - Alaska Airlines"="#488509",
          "B6 - JetBlue Airways"="#16339f",
          "DL - Delta Air Lines"="#e01e32",
          "EV - ExpressJet Airlines"="#27446a",
          "F9 - Frontier Airlines"="#176642",
          "G4 - Allegiant Air"="#00569c",
          "HA - Hawaii Airlines"="#ca0f88",
          "NK - Spirit Airlines"="#fcec03",
          "OO - SkyWest Airlines"="steelblue",
          "UA - United Airlines"="#1530a2",
          "WN - Southwest Airlines"="#f9a817",
          "YV - Mesa Airlines"="#aaa9ad",
          "YX - Republic Airways"="black")

What’s the deal with flight delays?

A comprehensive guide to everything you need to know about escaping this frozen tundra for warmer weather

Created for your convenience and pleasure by: Robert M.D. Bui, Alexander J. Marketos, Zoey P.A. Pham, Liv M. Scott

Part 1: Getting to know our project

Everyone hates it when their flight is delayed, right?

What better way to be emotionally prepared to handle delays, than to be informed about them? We have created some plots and apps to help with this.

We wanted to see how flight delays are affected by several variables including: airport, airline, and day of the week.

To collect our data, we found a dataset at the website for the Bureau of Transportation Statistics. Our main challenge was the large size of the data set, as it included every flight in the United States over 2018. Because the file was so large, turning our dataset into an fst file made the set much smaller and did not take as long to load into RStudio. The dates of each flight in were also in one variable (month-day-year), so we had to separate them into month, day, and year.

Part 2: Causes of Delay

How bad are delays, really?

Basics <- tibble(
  Delay = c("Delay", "Delay"),
  Type = c("Overall", "When_Delayed"),
  Minutes = c(9.969858, 38.24306)
)

Basics %>%
  ggplot(aes(x = Delay, y = Minutes, fill = Type)) +
  geom_bar(stat="identity", position = "dodge")+
  scale_fill_viridis_d()+
  theme_minimal()+
  labs(x = "Overal versus when delayed", y = "Delay in Minutes", title = "Average Delay")

Important Variables:

- Carrier delay

- Late Aircraft delay

- NAS delay

- Security delay

- Weather delay

Delay_Cause_Proportions <- tibble(
  Delay_Cause = c("Carrier", "Carrier", "Late_Aircraft", "Late_Aircraft", "NAS", "NAS","Security","Security", "Weather", "Weather"),
  Location = c("U.S.", "MN","U.S.", "MN","U.S.", "MN","U.S.", "MN","U.S.", "MN"),
  Proportion = c(0.280696452, 0.2978381078,0.370349213,  0.2717890005,0.180063432,0.2272410889, 0.001350881, 0.0008780598,0.052455338, 0.1132118506))

Delay_Cause_Proportions %>%
  ggplot(aes(x = Delay_Cause, y = Proportion, fill = Location)) +
  geom_bar(stat="identity", position = "dodge")+
  scale_fill_viridis_d()+
  theme_minimal()+
  labs(x = "Causes of Delay", y = "Proportion of Total Delay", title = "Cause of Delay in Minnesota versus the United States")

Average delay by day of the week, and by cause, at MSP

choice <- "MSP"

sp <- airports %>% 
  filter(
    ORIGIN == choice
  )

choice18 <- data18 %>% 
  filter(
    ORIGIN_AIRPORT_ID == sp$ORIGIN_AIRPORT_ID | DEST_AIRPORT_ID == sp$ORIGIN_AIRPORT_ID
  ) %>% 
  mutate(
    type = ifelse(
      ORIGIN_AIRPORT_ID == sp$ORIGIN_AIRPORT_ID,
      "Departures",
      "Arrivals"
    )
  )

choice18 %>% 
  mutate(
    wkday = wday(FL_DATE,label = T)
  ) %>% 
  group_by(wkday,type) %>% 
  summarise(
    #Actual = mean(DEP_DELAY, na.rm = T),
    Carrier = mean(CARRIER_DELAY, na.rm = T),
    Weather = mean(WEATHER_DELAY, na.rm = T),
    NAS = mean(NAS_DELAY, na.rm = T),
    `Late Aircraft` = mean(LATE_AIRCRAFT_DELAY, na.rm = T),
    Security = mean(SECURITY_DELAY, na.rm = T)
  ) %>% 
  pivot_longer(
    cols = -c(wkday,type),
    names_to = "Cause",
    values_to = "Minutes"
  ) %>% 
  ggplot()+
  geom_col(aes(x=wkday,y=Minutes,fill=Cause), position="stack")+
  theme_minimal()+
  scale_fill_viridis_d()+
  labs(
    x = "Day of the Week",
    y = "Average Delayed Minutes for MSP (NA Removed)"
  )+
  facet_grid(Cause ~ type)+
  coord_flip()

Part 3: Location, Location, Location

Which airports have the worst delays?

airportdelays <- data18 %>%
  group_by(ORIGIN_CITY_NAME) %>%
  summarize(
    avgdelay = mean(DEP_DELAY, na.rm=T),
    number = n()
  ) %>% 
  filter(number>=1000)

airportdelaysname <- data18 %>%
  group_by(ORIGIN_AIRPORT_ID) %>%
  summarise(
    avgdelay = mean(DEP_DELAY, na.rm=T),
    number = n()
  ) %>% filter(number>=1000)

airportdelaysfull <- airportdelays %>%
  inner_join(airportdelaysname, by = "avgdelay")

airportdelaycoords <- airportdelaysfull %>%
  inner_join(airports)

geo <- list(
      #scope = "usa",
      projection = list(
       type = 'orthographic',
       rotation = list(lon = -100, lat = 40, roll = 0)
      ),
      showland = T,
      landcolor = 'transparent',
      countrycolor = 'transparent'
    )
    
p1 <- plot_geo() %>%
      add_markers(
        data = airportdelaycoords, x = ~lon, y = ~lat, 
        text = ~paste("City:", ORIGIN_CITY_NAME, " | ",
                      "Average Delay:", avgdelay), 
        hoverinfo = "text",
        size = ~(avgdelay^3), 
        alpha=.99,
        color = ~avgdelay
      )

ggplotly(p1) %>% 
      layout(
        title = 'Which airports have the worst delays? (Minor Airports Filtered)',
        geo = geo, showlegend = TRUE,
        plot_bgcolor='transparent',
        paper_bgcolor='transparent'
      )

Part 4: Airline

Looking on the bright side, which airlines are best for getting you out of here early?

data18 %>% 
  
  filter(DEP_DELAY <= 0) %>% 
  group_by(OP_UNIQUE_CARRIER) %>% 
  count() %>% 
  
  ggplot(aes(x= reorder(OP_UNIQUE_CARRIER, -n),n, y= n, fill = OP_UNIQUE_CARRIER)) +
  geom_col() +
  coord_flip() +
  labs(fill="Airline",x = "Airline", y= "Number of early departures") +
  scale_y_continuous() +
  theme_minimal()+
  scale_fill_manual(values = cols)

In contrast, which airlines have the worst delays?

Here’s another way to choose an airline, if you want to avoid getting delayed

sorted <- data18 %>% 
  group_by(FL_DATE,OP_UNIQUE_CARRIER) %>% 
  summarise(
    n0=mean(DEP_DELAY,na.rm=T)
  ) %>% 
  group_by(OP_UNIQUE_CARRIER) %>% 
  mutate(
    cumsum=cumsum(n0)
  ) %>% 
  select(
    date=FL_DATE,
    carrier=OP_UNIQUE_CARRIER,
    cumsum=cumsum
  )

carriers <- unique(sorted$carrier)  # vector of all carriers
dates <- unique(sorted$date)  # vector of all dates

cts <- data.frame(carrier=carriers, date=as.Date("2017-12-31"),cumsum=as.integer(0)) 
# adding baseline of 0

sorted2 <- sorted %>% 
  expand(carrier,date=dates)

sorted3 <- left_join(sorted2,sorted)

sorted4 <- bind_rows(sorted3, cts) %>% # adding 1899 baseline to sorted df
  arrange(carrier,date) %>% 
  na.locf()

sorted5 <- sorted4 %>% 
  group_by(date) %>% 
  mutate(rank=rank(-cumsum,ties.method="first")) %>% 
  group_by(carrier) %>% 
  ungroup()

options(digits=2)

statplot <- sorted5 %>% 
  ggplot(aes(x= cumsum, y=rank,color=as.factor(carrier)))+
  geom_barh(stat = "identity", aes(fill=carrier))+   
  geom_text(aes(x=0,color=carrier, label = paste(carrier, " ")),vjust=0.2,hjust=1) +
  geom_text(aes(x=cumsum,label=paste("", trunc(cumsum))),vjust=.5,hjust = 0)+
  scale_y_reverse()+
  coord_cartesian(clip="off",expand=F)+ # disallows clipping of the axes
  guides(color = F, fill = F) +
  scale_fill_manual(values = cols)+
  scale_color_manual(values = cols)+
  theme_minimal()+
  theme(legend.position = "none",
        plot.margin = unit(c(1,1,1,4), "cm"),
        axis.ticks.y = element_blank(),
        axis.text.y  = element_blank())

p <- statplot +
  transition_states(states = date,transition_length = 6, state_length = 4)+
  view_follow(fixed_y=T)+
  ease_aes('quadratic-in-out')+
  enter_drift(x_mod = -1) + exit_drift(x_mod = 1) +
  labs(title = "Race to the bottom: Airline delay minutes",
       x="Accumulated delay minutes, adjusted for number of flights",
       y="",
       caption='{closest_state}')

animate(p,
        nframes = length(unique(sorted5$date))*2,
        fps=24, 
        width=900, 
        height=800)

Part 5: Times & Dates

Flights may be delayed more over different days of the year and times of day

p12 <- data18 %>% 
  group_by(FL_DATE) %>% 
  summarise(
    `Average Delay` = mean(DEP_DELAY,na.rm=T)
  ) %>% 
  ggplot(
    aes(text=paste("Day:",FL_DATE," | Average Delay:",`Average Delay`))
  )+
  geom_bar(
    stat="identity",
    aes(x=FL_DATE,y=`Average Delay`,fill=`Average Delay`)
  )+
    scale_fill_viridis_c()+
    labs(
      x = "Date",
      y = "Average Delay"
    )
 
ggplotly(p12, tooltip = "text")